/*
 * Copyright (C) 2009 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "dictbuilder.h"
#include "dicttrie.h"
#include "mystdlib.h"
#include "ngram.h"
#include "searchutility.h"
#include "spellingtable.h"
#include "spellingtrie.h"
#include "splparser.h"
#include "utf16reader.h"

namespace ime_pinyin
{

#ifdef ___BUILD_MODEL___

	static const size_t kReadBufLen = 512;
	static const size_t kSplTableHashLen = 2000;

	// Compare a SingleCharItem, first by Hanzis, then by spelling ids, then by
	// frequencies.
	int cmp_scis_hz_splid_freq(const void *p1, const void *p2)
	{
		const SingleCharItem *s1, *s2;
		s1 = static_cast<const SingleCharItem *>(p1);
		s2 = static_cast<const SingleCharItem *>(p2);

		if (s1->hz < s2->hz) {
			return -1;
		}
		if (s1->hz > s2->hz) {
			return 1;
		}

		if (s1->splid.half_splid < s2->splid.half_splid) {
			return -1;
		}
		if (s1->splid.half_splid > s2->splid.half_splid) {
			return 1;
		}

		if (s1->splid.full_splid < s2->splid.full_splid) {
			return -1;
		}
		if (s1->splid.full_splid > s2->splid.full_splid) {
			return 1;
		}

		if (s1->freq > s2->freq) {
			return -1;
		}
		if (s1->freq < s2->freq) {
			return 1;
		}
		return 0;
	}

	int cmp_scis_hz_splid(const void *p1, const void *p2)
	{
		const SingleCharItem *s1, *s2;
		s1 = static_cast<const SingleCharItem *>(p1);
		s2 = static_cast<const SingleCharItem *>(p2);

		if (s1->hz < s2->hz) {
			return -1;
		}
		if (s1->hz > s2->hz) {
			return 1;
		}

		if (s1->splid.half_splid < s2->splid.half_splid) {
			return -1;
		}
		if (s1->splid.half_splid > s2->splid.half_splid) {
			return 1;
		}

		if (s1->splid.full_splid < s2->splid.full_splid) {
			return -1;
		}
		if (s1->splid.full_splid > s2->splid.full_splid) {
			return 1;
		}

		return 0;
	}

	int cmp_lemma_entry_hzs(const void *p1, const void *p2)
	{
		size_t size1 = utf16_strlen(((const LemmaEntry *)p1)->hanzi_str);
		size_t size2 = utf16_strlen(((const LemmaEntry *)p2)->hanzi_str);
		if (size1 < size2) {
			return -1;
		} else if (size1 > size2) {
			return 1;
		}

		return utf16_strcmp(((const LemmaEntry *)p1)->hanzi_str,
		                    ((const LemmaEntry *)p2)->hanzi_str);
	}

	int compare_char16(const void *p1, const void *p2)
	{
		if (*((const char16 *)p1) < * ((const char16 *)p2)) {
			return -1;
		}
		if (*((const char16 *)p1) > *((const char16 *)p2)) {
			return 1;
		}
		return 0;
	}

	int compare_py(const void *p1, const void *p2)
	{
		int ret = utf16_strcmp(((const LemmaEntry *)p1)->spl_idx_arr,
		                       ((const LemmaEntry *)p2)->spl_idx_arr);

		if (0 != ret) {
			return ret;
		}

		return static_cast<int>(((const LemmaEntry *)p2)->freq) -
		       static_cast<int>(((const LemmaEntry *)p1)->freq);
	}

	// First hanzi, if the same, then Pinyin
	int cmp_lemma_entry_hzspys(const void *p1, const void *p2)
	{
		size_t size1 = utf16_strlen(((const LemmaEntry *)p1)->hanzi_str);
		size_t size2 = utf16_strlen(((const LemmaEntry *)p2)->hanzi_str);
		if (size1 < size2) {
			return -1;
		} else if (size1 > size2) {
			return 1;
		}
		int ret = utf16_strcmp(((const LemmaEntry *)p1)->hanzi_str,
		                       ((const LemmaEntry *)p2)->hanzi_str);

		if (0 != ret) {
			return ret;
		}

		ret = utf16_strcmp(((const LemmaEntry *)p1)->spl_idx_arr,
		                   ((const LemmaEntry *)p2)->spl_idx_arr);
		return ret;
	}

	int compare_splid2(const void *p1, const void *p2)
	{
		int ret = utf16_strcmp(((const LemmaEntry *)p1)->spl_idx_arr,
		                       ((const LemmaEntry *)p2)->spl_idx_arr);
		return ret;
	}

	DictBuilder::DictBuilder()
	{
		lemma_arr_ = NULL;
		lemma_num_ = 0;

		scis_ = NULL;
		scis_num_ = 0;

		lma_nodes_le0_ = NULL;
		lma_nodes_ge1_ = NULL;

		lma_nds_used_num_le0_ = 0;
		lma_nds_used_num_ge1_ = 0;

		homo_idx_buf_ = NULL;
		homo_idx_num_eq1_ = 0;
		homo_idx_num_gt1_ = 0;

		top_lmas_ = NULL;
		top_lmas_num_ = 0;

		spl_table_ = NULL;
		spl_parser_ = NULL;
	}

	DictBuilder::~DictBuilder()
	{
		free_resource();
	}

	bool DictBuilder::alloc_resource(size_t lma_num)
	{
		if (0 == lma_num) {
			return false;
		}

		free_resource();

		lemma_num_ = lma_num;
		lemma_arr_ = new LemmaEntry[lemma_num_];

		top_lmas_num_ = 0;
		top_lmas_ = new LemmaEntry[kTopScoreLemmaNum];

		// New the scis_ buffer to the possible maximum size.
		scis_num_ = lemma_num_ * kMaxLemmaSize;
		scis_ = new SingleCharItem[scis_num_];

		// The root and first level nodes is less than kMaxSpellingNum + 1
		lma_nds_used_num_le0_ = 0;
		lma_nodes_le0_ = new LmaNodeLE0[kMaxSpellingNum + 1];

		// Other nodes is less than lemma_num
		lma_nds_used_num_ge1_ = 0;
		lma_nodes_ge1_ = new LmaNodeGE1[lemma_num_];

		homo_idx_buf_ = new LemmaIdType[lemma_num_];
		spl_table_ = new SpellingTable();
		spl_parser_ = new SpellingParser();

		if (NULL == lemma_arr_ || NULL == top_lmas_ ||
		        NULL == scis_ || NULL == spl_table_ ||
		        NULL == spl_parser_ || NULL == lma_nodes_le0_ ||
		        NULL == lma_nodes_ge1_ || NULL == homo_idx_buf_) {
			free_resource();
			return false;
		}

		memset(lemma_arr_, 0, sizeof(LemmaEntry) * lemma_num_);
		memset(scis_, 0, sizeof(SingleCharItem) * scis_num_);
		memset(lma_nodes_le0_, 0, sizeof(LmaNodeLE0) * (kMaxSpellingNum + 1));
		memset(lma_nodes_ge1_, 0, sizeof(LmaNodeGE1) * lemma_num_);
		memset(homo_idx_buf_, 0, sizeof(LemmaIdType) * lemma_num_);
		spl_table_->init_table(kMaxPinyinSize, kSplTableHashLen, true);

		return true;
	}

	char16 *DictBuilder::read_valid_hanzis(const char *fn_validhzs, size_t *num)
	{
		if (NULL == fn_validhzs || NULL == num) {
			return NULL;
		}

		*num = 0;
		FILE *fp = fopen(fn_validhzs, "rb");
		if (NULL == fp) {
			return NULL;
		}

		char16 utf16header;
		if (fread(&utf16header, sizeof(char16), 1, fp) != 1 ||
		        0xfeff != utf16header) {
			fclose(fp);
			return NULL;
		}

		fseek(fp, 0, SEEK_END);
		*num = ftell(fp) / sizeof(char16);
		assert(*num >= 1);
		*num -= 1;

		char16 *hzs = new char16[*num];
		if (NULL == hzs) {
			fclose(fp);
			return NULL;
		}

		fseek(fp, 2, SEEK_SET);

		if (fread(hzs, sizeof(char16), *num, fp) != *num) {
			fclose(fp);
			delete [] hzs;
			return NULL;
		}
		fclose(fp);

		myqsort(hzs, *num, sizeof(char16), compare_char16);
		return hzs;
	}

	bool DictBuilder::hz_in_hanzis_list(const char16 *hzs, size_t hzs_len,
	                                    char16 hz)
	{
		if (NULL == hzs) {
			return false;
		}

		char16 *found;
		found = static_cast<char16 *>(
		            mybsearch(&hz, hzs, hzs_len, sizeof(char16), compare_char16));
		if (NULL == found) {
			return false;
		}

		assert(*found == hz);
		return true;
	}

	// The caller makes sure that the parameters are valid.
	bool DictBuilder::str_in_hanzis_list(const char16 *hzs, size_t hzs_len,
	                                     const char16 *str, size_t str_len)
	{
		if (NULL == hzs || NULL == str) {
			return false;
		}

		for (size_t pos = 0; pos < str_len; pos++) {
			if (!hz_in_hanzis_list(hzs, hzs_len, str[pos])) {
				return false;
			}
		}
		return true;
	}

	void DictBuilder::get_top_lemmas()
	{
		top_lmas_num_ = 0;
		if (NULL == lemma_arr_) {
			return;
		}

		for (size_t pos = 0; pos < lemma_num_; pos++) {
			if (0 == top_lmas_num_) {
				top_lmas_[0] = lemma_arr_[pos];
				top_lmas_num_ = 1;
				continue;
			}

			if (lemma_arr_[pos].freq > top_lmas_[top_lmas_num_ - 1].freq) {
				if (kTopScoreLemmaNum > top_lmas_num_) {
					top_lmas_num_ += 1;
				}

				size_t move_pos;
				for (move_pos = top_lmas_num_ - 1; move_pos > 0; move_pos--) {
					top_lmas_[move_pos] = top_lmas_[move_pos - 1];
					if (0 == move_pos - 1 ||
					        (move_pos - 1 > 0 &&
					         top_lmas_[move_pos - 2].freq > lemma_arr_[pos].freq)) {
						break;
					}
				}
				assert(move_pos > 0);
				top_lmas_[move_pos - 1] = lemma_arr_[pos];
			} else if (kTopScoreLemmaNum > top_lmas_num_) {
				top_lmas_[top_lmas_num_] = lemma_arr_[pos];
				top_lmas_num_ += 1;
			}
		}

		if (kPrintDebug0) {
			printf("\n------Top Lemmas------------------\n");
			for (size_t pos = 0; pos < top_lmas_num_; pos++) {
				printf("--%zd, idx:%06zd, score:%.5f\n", pos, top_lmas_[pos].idx_by_hz,
				       top_lmas_[pos].freq);
			}
		}
	}

	void DictBuilder::free_resource()
	{
		if (NULL != lemma_arr_) {
			delete [] lemma_arr_;
		}

		if (NULL != scis_) {
			delete [] scis_;
		}

		if (NULL != lma_nodes_le0_) {
			delete [] lma_nodes_le0_;
		}

		if (NULL != lma_nodes_ge1_) {
			delete [] lma_nodes_ge1_;
		}

		if (NULL != homo_idx_buf_) {
			delete [] homo_idx_buf_;
		}

		if (NULL != spl_table_) {
			delete spl_table_;
		}

		if (NULL != spl_parser_) {
			delete spl_parser_;
		}

		lemma_arr_ = NULL;
		scis_ = NULL;
		lma_nodes_le0_ = NULL;
		lma_nodes_ge1_ = NULL;
		homo_idx_buf_ = NULL;
		spl_table_ = NULL;
		spl_parser_ = NULL;

		lemma_num_ = 0;
		lma_nds_used_num_le0_ = 0;
		lma_nds_used_num_ge1_ = 0;
		homo_idx_num_eq1_ = 0;
		homo_idx_num_gt1_ = 0;
	}

	size_t DictBuilder::read_raw_dict(const char *fn_raw,
	                                  const char *fn_validhzs,
	                                  size_t max_item)
	{
		if (NULL == fn_raw) {
			return 0;
		}

		Utf16Reader utf16_reader;
		if (!utf16_reader.open(fn_raw, kReadBufLen * 10)) {
			return false;
		}

		char16 read_buf[kReadBufLen];

		// Read the number of lemmas in the file
		size_t lemma_num = 240000;

		// allocate resource required
		if (!alloc_resource(lemma_num)) {
			utf16_reader.close();
		}

		// Read the valid Hanzi list.
		char16 *valid_hzs = NULL;
		size_t valid_hzs_num = 0;
		valid_hzs = read_valid_hanzis(fn_validhzs, &valid_hzs_num);

		// Begin reading the lemma entries
		for (size_t i = 0; i < max_item; i++) {
			// read next entry
			if (!utf16_reader.readline(read_buf, kReadBufLen)) {
				lemma_num = i;
				break;
			}

			size_t token_size;
			char16 *token;
			char16 *to_tokenize = read_buf;

			// Get the Hanzi string
			token = utf16_strtok(to_tokenize, &token_size, &to_tokenize);
			if (NULL == token) {
				free_resource();
				utf16_reader.close();
				return false;
			}

			size_t lemma_size = utf16_strlen(token);

			if (lemma_size > kMaxLemmaSize) {
				i--;
				continue;
			}

			if (lemma_size > 4) {
				i--;
				continue;
			}

			// Copy to the lemma entry
			utf16_strcpy(lemma_arr_[i].hanzi_str, token);

			lemma_arr_[i].hz_str_len = token_size;

			// Get the freq string
			token = utf16_strtok(to_tokenize, &token_size, &to_tokenize);
			if (NULL == token) {
				free_resource();
				utf16_reader.close();
				return false;
			}
			lemma_arr_[i].freq = utf16_atof(token);

			if (lemma_size > 1 && lemma_arr_[i].freq < 60) {
				i--;
				continue;
			}

			// Get GBK mark, if no valid Hanzi list available, all items which contains
			// GBK characters will be discarded. Otherwise, all items which contains
			// characters outside of the valid Hanzi list will be discarded.
			token = utf16_strtok(to_tokenize, &token_size, &to_tokenize);
			assert(NULL != token);
			int gbk_flag = utf16_atoi(token);
			if (NULL == valid_hzs || 0 == valid_hzs_num) {
				if (0 != gbk_flag) {
					i--;
					continue;
				}
			} else {
				if (!str_in_hanzis_list(valid_hzs, valid_hzs_num,
				                        lemma_arr_[i].hanzi_str, lemma_arr_[i].hz_str_len)) {
					i--;
					continue;
				}
			}

			// Get spelling String
			bool spelling_not_support = false;
			for (size_t hz_pos = 0; hz_pos < (size_t)lemma_arr_[i].hz_str_len;
			        hz_pos++) {
				// Get a Pinyin
				token = utf16_strtok(to_tokenize, &token_size, &to_tokenize);
				if (NULL == token) {
					free_resource();
					utf16_reader.close();
					return false;
				}

				assert(utf16_strlen(token) <= kMaxPinyinSize);

				utf16_strcpy_tochar(lemma_arr_[i].pinyin_str[hz_pos], token);

				format_spelling_str(lemma_arr_[i].pinyin_str[hz_pos]);

				// Put the pinyin to the spelling table
				if (!spl_table_->put_spelling(lemma_arr_[i].pinyin_str[hz_pos],
				                              lemma_arr_[i].freq)) {
					spelling_not_support = true;
					break;
				}
			}

			// The whole line must have been parsed fully, otherwise discard this one.
			token = utf16_strtok(to_tokenize, &token_size, &to_tokenize);
			if (spelling_not_support || NULL != token) {
				i--;
				continue;
			}
		}

		delete [] valid_hzs;
		utf16_reader.close();

		printf("read successfully, lemma num: %zd\n", lemma_num);

		return lemma_num;
	}

	bool DictBuilder::build_dict(const char *fn_raw,
	                             const char *fn_validhzs,
	                             DictTrie *dict_trie)
	{
		if (NULL == fn_raw || NULL == dict_trie) {
			return false;
		}

		lemma_num_ = read_raw_dict(fn_raw, fn_validhzs, 240000);
		if (0 == lemma_num_) {
			return false;
		}

		// Arrange the spelling table, and build a spelling tree
		// The size of an spelling. '\0' is included. If the spelling table is
		// initialized to calculate the spelling scores, the last char in the
		// spelling string will be score, and it is also included in spl_item_size.
		size_t spl_item_size;
		size_t spl_num;
		const char *spl_buf;
		spl_buf = spl_table_->arrange(&spl_item_size, &spl_num);
		if (NULL == spl_buf) {
			free_resource();
			return false;
		}

		SpellingTrie &spl_trie = SpellingTrie::get_instance();

		if (!spl_trie.construct(spl_buf, spl_item_size, spl_num,
		                        spl_table_->get_score_amplifier(),
		                        spl_table_->get_average_score())) {
			free_resource();
			return false;
		}

		printf("spelling tree construct successfully.\n");

		// Convert the spelling string to idxs
		for (size_t i = 0; i < lemma_num_; i++) {
			for (size_t hz_pos = 0; hz_pos < (size_t)lemma_arr_[i].hz_str_len;
			        hz_pos++) {
				uint16 spl_idxs[2];
				uint16 spl_start_pos[3];
				bool is_pre = true;
				int spl_idx_num =
				    spl_parser_->splstr_to_idxs(lemma_arr_[i].pinyin_str[hz_pos],
				                                strlen(lemma_arr_[i].pinyin_str[hz_pos]),
				                                spl_idxs, spl_start_pos, 2, is_pre);
				assert(1 == spl_idx_num);

				if (spl_trie.is_half_id(spl_idxs[0])) {
					uint16 num = spl_trie.half_to_full(spl_idxs[0], spl_idxs);
					assert(0 != num);
				}
				lemma_arr_[i].spl_idx_arr[hz_pos] = spl_idxs[0];
			}
		}

		// Sort the lemma items according to the hanzi, and give each unique item a
		// id
		sort_lemmas_by_hz();

		scis_num_ = build_scis();

		// Construct the dict list
		dict_trie->dict_list_ = new DictList();
		bool dl_success = dict_trie->dict_list_->init_list(scis_, scis_num_,
		                  lemma_arr_, lemma_num_);
		assert(dl_success);

		// Construct the NGram information
		NGram &ngram = NGram::get_instance();
		ngram.build_unigram(lemma_arr_, lemma_num_,
		                    lemma_arr_[lemma_num_ - 1].idx_by_hz + 1);

		// sort the lemma items according to the spelling idx string
		myqsort(lemma_arr_, lemma_num_, sizeof(LemmaEntry), compare_py);

		get_top_lemmas();

#ifdef ___DO_STATISTICS___
		stat_init();
#endif

		lma_nds_used_num_le0_ = 1;  // The root node
		bool dt_success = construct_subset(static_cast<void *>(lma_nodes_le0_),
		                                   lemma_arr_, 0, lemma_num_, 0);
		if (!dt_success) {
			free_resource();
			return false;
		}

#ifdef ___DO_STATISTICS___
		stat_print();
#endif

		// Move the node data and homo data to the DictTrie
		dict_trie->root_ = new LmaNodeLE0[lma_nds_used_num_le0_];
		dict_trie->nodes_ge1_ = new LmaNodeGE1[lma_nds_used_num_ge1_];
		size_t lma_idx_num = homo_idx_num_eq1_ + homo_idx_num_gt1_ + top_lmas_num_;
		dict_trie->lma_idx_buf_ = new unsigned char[lma_idx_num * kLemmaIdSize];
		assert(NULL != dict_trie->root_);
		assert(NULL != dict_trie->lma_idx_buf_);
		dict_trie->lma_node_num_le0_ = lma_nds_used_num_le0_;
		dict_trie->lma_node_num_ge1_ = lma_nds_used_num_ge1_;
		dict_trie->lma_idx_buf_len_ = lma_idx_num * kLemmaIdSize;
		dict_trie->top_lmas_num_ = top_lmas_num_;

		memcpy(dict_trie->root_, lma_nodes_le0_,
		       sizeof(LmaNodeLE0) * lma_nds_used_num_le0_);
		memcpy(dict_trie->nodes_ge1_, lma_nodes_ge1_,
		       sizeof(LmaNodeGE1) * lma_nds_used_num_ge1_);

		for (size_t pos = 0; pos < homo_idx_num_eq1_ + homo_idx_num_gt1_; pos++) {
			id_to_charbuf(dict_trie->lma_idx_buf_ + pos * kLemmaIdSize,
			              homo_idx_buf_[pos]);
		}

		for (size_t pos = homo_idx_num_eq1_ + homo_idx_num_gt1_;
		        pos < lma_idx_num; pos++) {
			LemmaIdType idx =
			    top_lmas_[pos - homo_idx_num_eq1_ - homo_idx_num_gt1_].idx_by_hz;
			id_to_charbuf(dict_trie->lma_idx_buf_ + pos * kLemmaIdSize, idx);
		}

		if (kPrintDebug0) {
			printf("homo_idx_num_eq1_: %zd\n", homo_idx_num_eq1_);
			printf("homo_idx_num_gt1_: %zd\n", homo_idx_num_gt1_);
			printf("top_lmas_num_: %zd\n", top_lmas_num_);
		}

		free_resource();

		if (kPrintDebug0) {
			printf("Building dict succeds\n");
		}
		return dt_success;
	}

	void DictBuilder::id_to_charbuf(unsigned char *buf, LemmaIdType id)
	{
		if (NULL == buf) {
			return;
		}
		for (size_t pos = 0; pos < kLemmaIdSize; pos++) {
			(buf)[pos] = (unsigned char)(id >> (pos * 8));
		}
	}

	void DictBuilder::set_son_offset(LmaNodeGE1 *node, size_t offset)
	{
		node->son_1st_off_l = static_cast<uint16>(offset);
		node->son_1st_off_h = static_cast<unsigned char>(offset >> 16);
	}

	void DictBuilder:: set_homo_id_buf_offset(LmaNodeGE1 *node, size_t offset)
	{
		node->homo_idx_buf_off_l = static_cast<uint16>(offset);
		node->homo_idx_buf_off_h = static_cast<unsigned char>(offset >> 16);

	}

	// All spelling strings will be converted to upper case, except that
	// spellings started with "ZH"/"CH"/"SH" will be converted to
	// "Zh"/"Ch"/"Sh"
	void DictBuilder::format_spelling_str(char *spl_str)
	{
		if (NULL == spl_str) {
			return;
		}

		uint16 pos = 0;
		while ('\0' != spl_str[pos]) {
			if (spl_str[pos] >= 'a' && spl_str[pos] <= 'z') {
				spl_str[pos] = spl_str[pos] - 'a' + 'A';
			}

			if (1 == pos && 'H' == spl_str[pos]) {
				if ('C' == spl_str[0] || 'S' == spl_str[0] || 'Z' == spl_str[0]) {
					spl_str[pos] = 'h';
				}
			}
			pos++;
		}
	}

	LemmaIdType DictBuilder::sort_lemmas_by_hz()
	{
		if (NULL == lemma_arr_ || 0 == lemma_num_) {
			return 0;
		}

		myqsort(lemma_arr_, lemma_num_, sizeof(LemmaEntry), cmp_lemma_entry_hzs);

		lemma_arr_[0].idx_by_hz = 1;
		LemmaIdType idx_max = 1;
		for (size_t i = 1; i < lemma_num_; i++) {
			if (utf16_strcmp(lemma_arr_[i].hanzi_str, lemma_arr_[i - 1].hanzi_str)) {
				idx_max++;
				lemma_arr_[i].idx_by_hz = idx_max;
			} else {
				idx_max++;
				lemma_arr_[i].idx_by_hz = idx_max;
			}
		}
		return idx_max + 1;
	}

	size_t DictBuilder::build_scis()
	{
		if (NULL == scis_ || lemma_num_ * kMaxLemmaSize > scis_num_) {
			return 0;
		}

		SpellingTrie &spl_trie = SpellingTrie::get_instance();

		// This first one is blank, because id 0 is invalid.
		scis_[0].freq = 0;
		scis_[0].hz = 0;
		scis_[0].splid.full_splid = 0;
		scis_[0].splid.half_splid = 0;
		scis_num_ = 1;

		// Copy the hanzis to the buffer
		for (size_t pos = 0; pos < lemma_num_; pos++) {
			size_t hz_num = lemma_arr_[pos].hz_str_len;
			for (size_t hzpos = 0; hzpos < hz_num; hzpos++) {
				scis_[scis_num_].hz = lemma_arr_[pos].hanzi_str[hzpos];
				scis_[scis_num_].splid.full_splid = lemma_arr_[pos].spl_idx_arr[hzpos];
				scis_[scis_num_].splid.half_splid =
				    spl_trie.full_to_half(scis_[scis_num_].splid.full_splid);
				if (1 == hz_num) {
					scis_[scis_num_].freq = lemma_arr_[pos].freq;
				} else {
					scis_[scis_num_].freq = 0.000001;
				}
				scis_num_++;
			}
		}

		myqsort(scis_, scis_num_, sizeof(SingleCharItem), cmp_scis_hz_splid_freq);

		// Remove repeated items
		size_t unique_scis_num = 1;
		for (size_t pos = 1; pos < scis_num_; pos++) {
			if (scis_[pos].hz == scis_[pos - 1].hz &&
			        scis_[pos].splid.full_splid == scis_[pos - 1].splid.full_splid) {
				continue;
			}
			scis_[unique_scis_num] = scis_[pos];
			scis_[unique_scis_num].splid.half_splid =
			    spl_trie.full_to_half(scis_[pos].splid.full_splid);
			unique_scis_num++;
		}

		scis_num_ = unique_scis_num;

		// Update the lemma list.
		for (size_t pos = 0; pos < lemma_num_; pos++) {
			size_t hz_num = lemma_arr_[pos].hz_str_len;
			for (size_t hzpos = 0; hzpos < hz_num; hzpos++) {
				SingleCharItem key;
				key.hz = lemma_arr_[pos].hanzi_str[hzpos];
				key.splid.full_splid = lemma_arr_[pos].spl_idx_arr[hzpos];
				key.splid.half_splid = spl_trie.full_to_half(key.splid.full_splid);

				SingleCharItem *found;
				found = static_cast<SingleCharItem *>(mybsearch(&key, scis_,
				                                      unique_scis_num,
				                                      sizeof(SingleCharItem),
				                                      cmp_scis_hz_splid));

				assert(found);

				lemma_arr_[pos].hanzi_scis_ids[hzpos] =
				    static_cast<uint16>(found - scis_);
				lemma_arr_[pos].spl_idx_arr[hzpos] = found->splid.full_splid;
			}
		}

		return scis_num_;
	}

	bool DictBuilder::construct_subset(void *parent, LemmaEntry *lemma_arr,
	                                   size_t item_start, size_t item_end,
	                                   size_t level)
	{
		if (level >= kMaxLemmaSize || item_end <= item_start) {
			return false;
		}

		// 1. Scan for how many sons
		size_t parent_son_num = 0;
		// LemmaNode *son_1st = NULL;
		// parent.num_of_son = 0;

		LemmaEntry *lma_last_start = lemma_arr_ + item_start;
		uint16 spl_idx_node = lma_last_start->spl_idx_arr[level];

		// Scan for how many sons to be allocaed
		for (size_t i = item_start + 1; i < item_end; i++) {
			LemmaEntry *lma_current = lemma_arr + i;
			uint16 spl_idx_current = lma_current->spl_idx_arr[level];
			if (spl_idx_current != spl_idx_node) {
				parent_son_num++;
				spl_idx_node = spl_idx_current;
			}
		}
		parent_son_num++;

#ifdef ___DO_STATISTICS___
		// Use to indicate whether all nodes of this layer have no son.
		bool allson_noson = true;

		assert(level < kMaxLemmaSize);
		if (parent_son_num > max_sonbuf_len_[level]) {
			max_sonbuf_len_[level] = parent_son_num;
		}

		total_son_num_[level] += parent_son_num;
		total_sonbuf_num_[level] += 1;

		if (parent_son_num == 1) {
			sonbufs_num1_++;
		} else {
			sonbufs_numgt1_++;
		}
		total_lma_node_num_ += parent_son_num;
#endif

		// 2. Update the parent's information
		//    Update the parent's son list;
		LmaNodeLE0 *son_1st_le0 = NULL;  // only one of le0 or ge1 is used
		LmaNodeGE1 *son_1st_ge1 = NULL;  // only one of le0 or ge1 is used.
		if (0 == level) {                 // the parent is root
			(static_cast<LmaNodeLE0 *>(parent))->son_1st_off =
			    lma_nds_used_num_le0_;
			son_1st_le0 = lma_nodes_le0_ + lma_nds_used_num_le0_;
			lma_nds_used_num_le0_ += parent_son_num;

			assert(parent_son_num <= 65535);
			(static_cast<LmaNodeLE0 *>(parent))->num_of_son =
			    static_cast<uint16>(parent_son_num);
		} else if (1 == level) {  // the parent is a son of root
			(static_cast<LmaNodeLE0 *>(parent))->son_1st_off =
			    lma_nds_used_num_ge1_;
			son_1st_ge1 = lma_nodes_ge1_ + lma_nds_used_num_ge1_;
			lma_nds_used_num_ge1_ += parent_son_num;

			assert(parent_son_num <= 65535);
			(static_cast<LmaNodeLE0 *>(parent))->num_of_son =
			    static_cast<uint16>(parent_son_num);
		} else {
			set_son_offset((static_cast<LmaNodeGE1 *>(parent)),
			               lma_nds_used_num_ge1_);
			son_1st_ge1 = lma_nodes_ge1_ + lma_nds_used_num_ge1_;
			lma_nds_used_num_ge1_ += parent_son_num;

			assert(parent_son_num <= 255);
			(static_cast<LmaNodeGE1 *>(parent))->num_of_son =
			    (unsigned char)parent_son_num;
		}

		// 3. Now begin to construct the son one by one
		size_t son_pos = 0;

		lma_last_start = lemma_arr_ + item_start;
		spl_idx_node = lma_last_start->spl_idx_arr[level];

		size_t homo_num = 0;
		if (lma_last_start->spl_idx_arr[level + 1] == 0) {
			homo_num = 1;
		}

		size_t item_start_next = item_start;

		for (size_t i = item_start + 1; i < item_end; i++) {
			LemmaEntry *lma_current = lemma_arr_ + i;
			uint16 spl_idx_current = lma_current->spl_idx_arr[level];

			if (spl_idx_current == spl_idx_node) {
				if (lma_current->spl_idx_arr[level + 1] == 0) {
					homo_num++;
				}
			} else {
				// Construct a node
				LmaNodeLE0 *node_cur_le0 = NULL;  // only one of them is valid
				LmaNodeGE1 *node_cur_ge1 = NULL;
				if (0 == level) {
					node_cur_le0 = son_1st_le0 + son_pos;
					node_cur_le0->spl_idx = spl_idx_node;
					node_cur_le0->homo_idx_buf_off = homo_idx_num_eq1_ + homo_idx_num_gt1_;
					node_cur_le0->son_1st_off = 0;
					homo_idx_num_eq1_ += homo_num;
				} else {
					node_cur_ge1 = son_1st_ge1 + son_pos;
					node_cur_ge1->spl_idx = spl_idx_node;

					set_homo_id_buf_offset(node_cur_ge1,
					                       (homo_idx_num_eq1_ + homo_idx_num_gt1_));
					set_son_offset(node_cur_ge1, 0);
					homo_idx_num_gt1_ += homo_num;
				}

				if (homo_num > 0) {
					LemmaIdType *idx_buf = homo_idx_buf_ + homo_idx_num_eq1_ +
					                       homo_idx_num_gt1_ - homo_num;
					if (0 == level) {
						assert(homo_num <= 65535);
						node_cur_le0->num_of_homo = static_cast<uint16>(homo_num);
					} else {
						assert(homo_num <= 255);
						node_cur_ge1->num_of_homo = (unsigned char)homo_num;
					}

					for (size_t homo_pos = 0; homo_pos < homo_num; homo_pos++) {
						idx_buf[homo_pos] = lemma_arr_[item_start_next + homo_pos].idx_by_hz;
					}

#ifdef ___DO_STATISTICS___
					if (homo_num > max_homobuf_len_[level]) {
						max_homobuf_len_[level] = homo_num;
					}

					total_homo_num_[level] += homo_num;
#endif
				}

				if (i - item_start_next > homo_num) {
					void *next_parent;
					if (0 == level) {
						next_parent = static_cast<void *>(node_cur_le0);
					} else {
						next_parent = static_cast<void *>(node_cur_ge1);
					}
					construct_subset(next_parent, lemma_arr,
					                 item_start_next + homo_num, i, level + 1);
#ifdef ___DO_STATISTICS___

					total_node_hasson_[level] += 1;
					allson_noson = false;
#endif
				}

				// for the next son
				lma_last_start = lma_current;
				spl_idx_node = spl_idx_current;
				item_start_next = i;
				homo_num = 0;
				if (lma_current->spl_idx_arr[level + 1] == 0) {
					homo_num = 1;
				}

				son_pos++;
			}
		}

		// 4. The last one to construct
		LmaNodeLE0 *node_cur_le0 = NULL;  // only one of them is valid
		LmaNodeGE1 *node_cur_ge1 = NULL;
		if (0 == level) {
			node_cur_le0 = son_1st_le0 + son_pos;
			node_cur_le0->spl_idx = spl_idx_node;
			node_cur_le0->homo_idx_buf_off = homo_idx_num_eq1_ + homo_idx_num_gt1_;
			node_cur_le0->son_1st_off = 0;
			homo_idx_num_eq1_ += homo_num;
		} else {
			node_cur_ge1 = son_1st_ge1 + son_pos;
			node_cur_ge1->spl_idx = spl_idx_node;

			set_homo_id_buf_offset(node_cur_ge1,
			                       (homo_idx_num_eq1_ + homo_idx_num_gt1_));
			set_son_offset(node_cur_ge1, 0);
			homo_idx_num_gt1_ += homo_num;
		}

		if (homo_num > 0) {
			LemmaIdType *idx_buf = homo_idx_buf_ + homo_idx_num_eq1_ +
			                       homo_idx_num_gt1_ - homo_num;
			if (0 == level) {
				assert(homo_num <= 65535);
				node_cur_le0->num_of_homo = static_cast<uint16>(homo_num);
			} else {
				assert(homo_num <= 255);
				node_cur_ge1->num_of_homo = (unsigned char)homo_num;
			}

			for (size_t homo_pos = 0; homo_pos < homo_num; homo_pos++) {
				idx_buf[homo_pos] = lemma_arr[item_start_next + homo_pos].idx_by_hz;
			}

#ifdef ___DO_STATISTICS___
			if (homo_num > max_homobuf_len_[level]) {
				max_homobuf_len_[level] = homo_num;
			}

			total_homo_num_[level] += homo_num;
#endif
		}

		if (item_end - item_start_next > homo_num) {
			void *next_parent;
			if (0 == level) {
				next_parent = static_cast<void *>(node_cur_le0);
			} else {
				next_parent = static_cast<void *>(node_cur_ge1);
			}
			construct_subset(next_parent, lemma_arr,
			                 item_start_next + homo_num, item_end, level + 1);
#ifdef ___DO_STATISTICS___

			total_node_hasson_[level] += 1;
			allson_noson = false;
#endif
		}

#ifdef ___DO_STATISTICS___
		if (allson_noson) {
			total_sonbuf_allnoson_[level] += 1;
			total_node_in_sonbuf_allnoson_[level] += parent_son_num;
		}
#endif

		assert(son_pos + 1 == parent_son_num);
		return true;
	}

#ifdef ___DO_STATISTICS___
	void DictBuilder::stat_init()
	{
		memset(max_sonbuf_len_, 0, sizeof(size_t) * kMaxLemmaSize);
		memset(max_homobuf_len_, 0, sizeof(size_t) * kMaxLemmaSize);
		memset(total_son_num_, 0, sizeof(size_t) * kMaxLemmaSize);
		memset(total_node_hasson_, 0, sizeof(size_t) * kMaxLemmaSize);
		memset(total_sonbuf_num_, 0, sizeof(size_t) * kMaxLemmaSize);
		memset(total_sonbuf_allnoson_, 0, sizeof(size_t) * kMaxLemmaSize);
		memset(total_node_in_sonbuf_allnoson_, 0, sizeof(size_t) * kMaxLemmaSize);
		memset(total_homo_num_, 0, sizeof(size_t) * kMaxLemmaSize);

		sonbufs_num1_ = 0;
		sonbufs_numgt1_ = 0;
		total_lma_node_num_ = 0;
	}

	void DictBuilder::stat_print()
	{
		printf("\n------------STAT INFO-------------\n");
		printf("[root is layer -1]\n");
		printf(".. max_sonbuf_len per layer(from layer 0):\n   ");
		for (size_t i = 0; i < kMaxLemmaSize; i++) {
			printf("%zd, ", max_sonbuf_len_[i]);
		}
		printf("-, \n");

		printf(".. max_homobuf_len per layer:\n   -, ");
		for (size_t i = 0; i < kMaxLemmaSize; i++) {
			printf("%zd, ", max_homobuf_len_[i]);
		}
		printf("\n");

		printf(".. total_son_num per layer:\n   ");
		for (size_t i = 0; i < kMaxLemmaSize; i++) {
			printf("%zd, ", total_son_num_[i]);
		}
		printf("-, \n");

		printf(".. total_node_hasson per layer:\n   1, ");
		for (size_t i = 0; i < kMaxLemmaSize; i++) {
			printf("%zd, ", total_node_hasson_[i]);
		}
		printf("\n");

		printf(".. total_sonbuf_num per layer:\n   ");
		for (size_t i = 0; i < kMaxLemmaSize; i++) {
			printf("%zd, ", total_sonbuf_num_[i]);
		}
		printf("-, \n");

		printf(".. total_sonbuf_allnoson per layer:\n   ");
		for (size_t i = 0; i < kMaxLemmaSize; i++) {
			printf("%zd, ", total_sonbuf_allnoson_[i]);
		}
		printf("-, \n");

		printf(".. total_node_in_sonbuf_allnoson per layer:\n   ");
		for (size_t i = 0; i < kMaxLemmaSize; i++) {
			printf("%zd, ", total_node_in_sonbuf_allnoson_[i]);
		}
		printf("-, \n");

		printf(".. total_homo_num per layer:\n   0, ");
		for (size_t i = 0; i < kMaxLemmaSize; i++) {
			printf("%zd, ", total_homo_num_[i]);
		}
		printf("\n");

		printf(".. son buf allocation number with only 1 son: %zd\n", sonbufs_num1_);
		printf(".. son buf allocation number with more than 1 son: %zd\n",
		       sonbufs_numgt1_);
		printf(".. total lemma node number: %zd\n", total_lma_node_num_ + 1);
	}
#endif  // ___DO_STATISTICS___

#endif  // ___BUILD_MODEL___
}  // namespace ime_pinyin
